Gender bias in speakers and career position

Data

EcoEncontros Seminar talks

Talks from the EcoEncontros Seminar series at the Graduate Program of Ecology of the University of São Paulo (PPGE-USP), Brazil

See the file metadata.txt, in the folder data, for a more detailed description of the dataset.

# Load the seminar talks table, parse the talk date with dmy() and derive the
# year of each talk from it.
data <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",", header = TRUE, as.is = TRUE
)
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.

# Talk ids to be removed from the dataset.
IDs <- c(154, 250, 211, 289)
data <- filter(data, !(id %in% IDs))

For this specific analysis, we exclude speakers who are not in academia (“others”) and keep students (undergraduate, MD and PhD students, grouped in the category student), postdocs, and professors or researchers*.

*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.

# Keep only academic speakers and order the position levels as
# student, postdoc, professor.
data <- data %>%
  filter(position_cat != "others") %>%
  mutate(
    position_cat = fct_relevel(position_cat, "student", "postdoc", "professor")
  )

Creating dummy column to indicate if the speaker is a female (1) or not (0)

# fem is 0 for speakers recorded as "M" and 1 for every other row.
data$fem <- ifelse(data$gender %in% "M", 0, 1)

Population data from PPGE-USP from 2008-2019

Number of students, postdocs and professors in the PPGE-USP per gender and year.

# Load the PPGE-USP population table and print it.
pop <- read.table(
  "data/pop_PPGE_2008-2019.csv",
  sep = ",", header = TRUE, as.is = TRUE
)
kable(pop)
year student_F student_M professor_F professor_M postdoc_F postdoc_M total_F total_M
2008 51 24 4 3 2 1 57 28
2009 53 23 1 3 4 3 58 29
2010 51 24 1 4 4 3 56 31
2011 54 31 1 4 7 3 62 38
2012 52 31 2 7 8 4 62 42
2013 57 39 5 7 8 9 70 55
2014 56 39 6 6 5 12 67 57
2015 52 39 8 8 3 15 63 62
2016 51 38 9 8 4 14 64 60
2017 43 36 9 9 7 14 59 59
2018 42 33 8 9 10 12 60 54
2019 41 37 2 6 10 7 53 50

Data description

dim(data)
## [1] 330  31

Speakers data

Proportion and number of male and female speakers per academic position.

# Cross-tabulate position by gender, turn the counts into row percentages
# and add the raw counts next to each percentage.
data %>%
  tabyl(position_cat, gender) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting(digits = 0) %>%
  adorn_ns() %>%
  kable(
    caption = "Proportion and number (in parenthesis) of females (F) and males (M) per academic position category."
  )
Proportion and number (in parenthesis) of females (F) and males (M) per academic position category.
position_cat F M
student 53% (91) 47% (82)
postdoc 44% (25) 56% (32)
professor 25% (25) 75% (75)
# Counts of speakers per academic position (rows) and gender (columns F, M).
n.y <- data %>% tabyl(position_cat, gender)
# Percentage of female speakers per position, computed from the counts so the
# plot labels always agree with the table above.
pct_f <- paste0(round(100 * n.y$F / (n.y$F + n.y$M)), "%")
ggplot(data, aes(x = position_cat, fill = gender)) +
  geom_bar() +
  ylab("Number of speakers") +
  xlab("") +
  scale_fill_manual("gender", values = c("#b2abd2", "#fdb863")) +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(size = 16)
  ) +
  # Labels are placed at M + F/2, i.e. the middle of the female segment
  # (assumes the F segment is stacked above the M segment -- TODO confirm).
  annotate("text", x = seq_len(nrow(n.y)), y = n.y$M + n.y$F / 2, size = 5,
           label = pct_f)

ggsave("figures/numberSpeakers_position.jpeg", units = "in", width = 7, height = 4.5, dpi = 300)

Variation in time.

Origin of the speakers

There were 143 (43%) talks given by people from the PPGE population.

Including talks from the Institute of Biosciences, USP, there were 180 (55%).

PPGE-USP population data

PPGE-USP population size by gender in time

PPGE-USP population size by gender and position and year

# Stacked bars of population size per year, one panel per category, labelled
# with the percentage given by propFcat.
# NOTE(review): columns 7:8 of pop2 are presumably the F and M counts --
# confirm against the chunk that builds pop2.
pop2 %>%
  mutate(ytext = M + F / 2) %>%
  pivot_longer(7:8, names_to = "gender", values_to = "N") %>%
  ggplot(aes(x = as.factor(year), y = N, fill = gender)) +
  geom_col() +
  facet_wrap(~category, scales = "free", ncol = 1) +
  labs(x = "", y = "N") +
  scale_fill_manual("gender", values = c("#b2abd2", "#fdb863")) +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1)
  ) +
  geom_text(
    aes(
      x = as.factor(year),
      y = ytext,
      label = paste0(round(propFcat * 100), "%")
    )
  )

ggsave("figures/popSize_positionYear.jpeg", units = "in", width = 7, height = 14, dpi = 300)

Comparing proportions of female speakers in the seminar and in the population by position.

# Number of speakers per year, position and gender, spread into one column per
# gender, plus the proportion of female speakers.
propS <- data %>%
  group_by(year, position_cat, gender) %>%
  count() %>%
  pivot_wider(names_from = gender, values_from = n, values_fill = 0) %>%
  mutate(propFspeaker = F / (M + F))
# Population proportions matched to the speaker proportions of the same year
# and position.
proportions <- pop2 %>%
  dplyr::select(year, category, propFcat, F, M) %>%
  rename(position_cat = category) %>%
  left_join(propS, by = c("year", "position_cat")) %>%
  mutate(
    position_cat = fct_relevel(position_cat, "student", "postdoc", "professor")
  )
# Speaker proportion against population proportion; the dashed diagonal marks
# equal proportions and the gray lines mark 50%.
ggplot(proportions, aes(x = propFcat, y = propFspeaker, col = year)) +
  scale_color_gradient() +
  geom_point() +
  facet_wrap(~position_cat) +
  xlim(0, 1) +
  labs(
    x = "Proportion of females in the population",
    y = "Proportion of females as speakers"
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_hline(yintercept = 0.5, linetype = "dashed", col = "lightgray") +
  geom_vline(xintercept = 0.5, linetype = "dashed", col = "lightgray")
Relationship between the proportion of females in the PPGE population in each category of academic position in each year (x axis) and the proportion of female speakers in the same category and year (y axis). Horizontal and vertical gray dashed lines mark 50% for each proportion, and the diagonal black dashed line indicates where the two proportions are equal.

Relationship between the proportion of females in the PPGE population in each category of academic position in each year (x axis) and the proportion of female speakers in the same category and year (y axis). Horizontal and vertical gray dashed lines mark 50% for each proportion, and the diagonal black dashed line indicates where the two proportions are equal.

ggsave("figures/propFemale_popXspeaker.jpeg", units="in", width=14, height=5, dpi=300)

By year

# Stack the population (propFcat) and speaker (propFspeaker) proportions into
# one column so both series can be drawn per year. The columns are selected by
# name rather than by position, so the plot does not break if the column
# order of `proportions` changes.
proportions %>%
  pivot_longer(
    c(propFcat, propFspeaker),
    names_to = "data", values_to = "proportion"
  ) %>%
  ggplot(aes(x = year, y = proportion, col = data)) +
  geom_point() +
  scale_color_manual(
    name = "Dataset", values = 1:2,
    labels = c("PPGE population", "Speakers pop")
  ) +
  geom_smooth(method = "lm") +
  facet_wrap(~position_cat, ncol = 1) +
  ylab("Proportion of females") +
  geom_hline(yintercept = 0.5, linetype = "dashed", col = "lightgray")

ggsave("figures/propFemale_popXspeaker_byYear.jpeg", units = "in", width = 7, height = 10, dpi = 300)

Modeling

Proportions of female speakers by academic position and time.

Binomial models with the response variable coded as 0 if the speaker is male or 1 if female. Explanatory variables are year and academic position.

OBS: Starting in 2018, the EcoEncontros students committee actively tried to balance gender in presentations as an affirmative policy in the group. Because of that, we also analyzed whether the proportions varied between before and after the policy.

# Talks given before 2018 are labelled "before", the remaining ones "after";
# "before" is the first factor level.
data$affirm_action <- factor(
  ifelse(data$year < 2018, "before", "after"),
  levels = c("before", "after")
)

There were 256 seminars before and 74 after the affirmative actions.

# Candidate binomial GLMs for the probability that a speaker is female.
# Null model and single-predictor models.
mod0 <- glm(fem ~ 1, family = binomial, data = data)
mod1 <- glm(fem ~ year, family = binomial, data = data)
mod2 <- glm(fem ~ affirm_action, family = binomial, data = data)
mod3 <- glm(fem ~ position_cat, family = binomial, data = data)

# Academic position combined with a time term, additively or with interaction.
mod4 <- glm(fem ~ position_cat + year, family = binomial, data = data)
mod5 <- glm(fem ~ position_cat * year, family = binomial, data = data)
mod6 <- glm(fem ~ position_cat + affirm_action, family = binomial, data = data)
mod7 <- glm(fem ~ position_cat * affirm_action, family = binomial, data = data)

# AIC-based comparison of the whole model set.
kable(
  AICtab(mod0, mod1, mod2, mod3, mod4, mod5, mod6, mod7,
         base = TRUE, weights = TRUE),
  digits = 2
)
AIC dAIC df weight
mod7 434.13 0.00 6 0.42
mod5 434.98 0.84 6 0.28
mod3 435.98 1.85 3 0.17
mod6 437.56 3.43 4 0.08
mod4 437.98 3.85 4 0.06
mod0 452.47 18.34 1 0.00
mod2 453.11 18.98 2 0.00
mod1 454.42 20.28 2 0.00
#as.data.frame(AICtab(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7, base=T, weights=T)) %>%
#  mutate_at(c(1,2,4), round, digits=2) %>% kable()

The best model is the one with the interaction between academic position and affirmative actions (as time event), but the model with the interaction between academic position and year and the model with only academic position are equally plausible.

Residual diagnostic of the selected models

All equally plausible models presented satisfactory residual diagnostics.

hnp(mod7)
## Binomial model

hnp(mod5)
## Binomial model

hnp(mod3)
## Binomial model

plot(simulateResiduals(mod7))

plot(simulateResiduals(mod5))

plot(simulateResiduals(mod3))

Models results

summary(mod7)
## 
## Call:
## glm(formula = fem ~ position_cat * affirm_action, family = binomial, 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5518  -1.1073  -0.6945   1.0901   1.7552  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                                0.2088     0.1799   1.161   0.2458    
## position_catpostdoc                       -0.6857     0.3498  -1.960   0.0500 *  
## position_catprofessor                     -1.5080     0.3210  -4.698 2.63e-06 ***
## affirm_actionafter                        -0.3758     0.3410  -1.102   0.2704    
## position_catpostdoc:affirm_actionafter     1.7000     0.8261   2.058   0.0396 *  
## position_catprofessor:affirm_actionafter   1.4238     0.6640   2.144   0.0320 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 422.13  on 324  degrees of freedom
## AIC: 434.13
## 
## Number of Fisher Scoring iterations: 4
performance::r2(mod7)
## # R2 for Logistic Regression
##   Tjur's R2: 0.082
my7 <- ggpredict(mod7, terms=c("position_cat","affirm_action"))
plot(my7) +
  geom_hline(yintercept = 0.5, linetype="dashed")

Figure: proportion of female speakers by position_cat and affirm_action

# Number of talks per position, period and outcome (fem = 0 or 1), and the
# predictions stored in `my7` as a plain data frame.
suma <- data %>% count(position_cat, affirm_action, fem)
prs <- as.data.frame(my7)

ggplot(suma, aes(x = position_cat, y = fem, col = affirm_action)) +
  # Raw data drawn at fem = 0 and fem = 1, sized by the number of talks.
  # `show.legend = FALSE` is the ggplot2 argument that hides this layer's
  # guides; the original `show_guides` is not a recognised parameter.
  geom_point(aes(size = n), position = position_dodge(0.6), alpha = 0.2,
             show.legend = FALSE) +
  scale_size(range = c(1, 10), breaks = c(3, 10, 20, 60)) +
  # Predicted proportions with their confidence limits.
  geom_pointrange(
    data = prs,
    aes(x = x, y = predicted, col = group,
        ymax = conf.high, ymin = conf.low),
    position = position_dodge(0.6)
  ) +
  geom_hline(yintercept = 0.5, linetype = "dashed", col = "gray") +
  scale_color_manual(name = "Affirmative \n actions",
                     values = c("goldenrod", "green4")) +
  ylab("Proportion of female speakers") +
  xlab("Academic position") +
  theme(text = element_text(size = 20),
        axis.text = element_text(size = 18))

ggsave("figures/prop_female_speakers.jpeg", width = 9, height = 6)
summary(mod5)
## 
## Call:
## glm(formula = fem ~ position_cat * year, family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4358  -1.0948  -0.7194   1.1282   1.8046  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 126.68013   86.87568   1.458   0.1448  
## position_catpostdoc        -482.23276  199.27943  -2.420   0.0155 *
## position_catprofessor      -255.41782  187.46781  -1.362   0.1731  
## year                         -0.06284    0.04313  -1.457   0.1451  
## position_catpostdoc:year      0.23924    0.09893   2.418   0.0156 *
## position_catprofessor:year    0.12622    0.09308   1.356   0.1751  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 422.98  on 324  degrees of freedom
## AIC: 434.98
## 
## Number of Fisher Scoring iterations: 4
performance::r2(mod5)
## # R2 for Logistic Regression
##   Tjur's R2: 0.081
my5 <- ggpredict(mod5, terms=c("year", "position_cat"))
plot(my5) +
  geom_hline(yintercept = 0.5, linetype="dashed")

summary(mod3)
## 
## Call:
## glm(formula = fem ~ position_cat, family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2219  -1.0745  -0.7585   1.1335   1.6651  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.1041     0.1523   0.684    0.494    
## position_catpostdoc    -0.3510     0.3073  -1.142    0.253    
## position_catprofessor  -1.2028     0.2766  -4.348 1.37e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 429.98  on 327  degrees of freedom
## AIC: 435.98
## 
## Number of Fisher Scoring iterations: 4
performance::r2(mod3)
## # R2 for Logistic Regression
##   Tjur's R2: 0.060
my3 <- ggpredict(mod3, terms=c("position_cat"))
plot(my3) +
  geom_hline(yintercept = 0.5, linetype="dashed")

Proportions of female speakers by academic position and time - controlling for population proportions

The models below control for differences in gender ratios in the PPGE population by academic position. For that, we included in every model of the model set a control variable with the ratio of females in each year and position.

# Attach the population female ratio (ratioFcat) of the matching year and
# position category to every talk.
# NOTE(review): columns 1:2 and 10 of pop2 are presumably year, category and
# ratioFcat -- confirm against the chunk that builds pop2.
# NOTE(review): the summaries of the modc* models use "postdoc" as reference
# level of position_cat, while mod0-mod7 use "student"; presumably this join
# turns position_cat into a character vector. Re-apply fct_relevel() after the
# join if the coefficient tables should be directly comparable.
data <- data %>%
  left_join(pop2[, c(1:2, 10)], by = c("year", "position_cat" = "category"))

# Same model set as mod0-mod7, each with ratioFcat added as a control term.
modc0 <- glm(fem ~ 1 + ratioFcat, family = binomial, data = data)
modc1 <- glm(fem ~ year + ratioFcat, family = binomial, data = data)
modc2 <- glm(fem ~ affirm_action + ratioFcat, family = binomial, data = data)
modc3 <- glm(fem ~ position_cat + ratioFcat, family = binomial, data = data)

modc4 <- glm(fem ~ position_cat + year + ratioFcat, family = binomial,
             data = data)
modc5 <- glm(fem ~ position_cat * year + ratioFcat, family = binomial,
             data = data)
modc6 <- glm(fem ~ position_cat + affirm_action + ratioFcat, family = binomial,
             data = data)
modc7 <- glm(fem ~ position_cat * affirm_action + ratioFcat, family = binomial,
             data = data)

AICtab(modc0, modc1, modc2, modc3, modc4, modc5, modc6, modc7,
       base = TRUE, weights = TRUE)
##       AIC   dAIC  df weight
## modc7 436.1   0.0 7  0.364 
## modc5 436.9   0.7 7  0.252 
## modc3 437.6   1.5 4  0.173 
## modc6 438.9   2.7 5  0.093 
## modc4 439.4   3.3 5  0.070 
## modc1 442.2   6.1 3  0.018 
## modc2 442.2   6.1 3  0.018 
## modc0 442.8   6.7 2  0.013

Similar results as without the ratio of females in the PPGE population

Residual diagnostic of the selected models

All equally plausible models presented satisfactory residual diagnostics.

par(mfrow=c(2,2))
hnp(modc7)
## Binomial model
hnp(modc5)
## Binomial model
hnp(modc3)
## Binomial model

plot(simulateResiduals(modc7))

plot(simulateResiduals(modc5))

plot(simulateResiduals(modc3))   

Comparing both model sets - controlling and not controlling for population proportions

# Joint AIC comparison of the models without (mod*) and with (modc*) the
# population-ratio control term.
AICtab(mod0, mod1, mod2, mod3, mod4, mod5, mod6, mod7,
       modc0, modc1, modc2, modc3, modc4, modc5, modc6, modc7,
       base = TRUE, weights = TRUE)
##       AIC   dAIC  df weight
## mod7  434.1   0.0 6  0.2948
## mod5  435.0   0.8 6  0.1933
## mod3  436.0   1.8 3  0.1169
## modc7 436.1   2.0 7  0.1088
## modc5 436.9   2.7 7  0.0752
## mod6  437.6   3.4 4  0.0531
## modc3 437.6   3.5 4  0.0517
## mod4  438.0   3.8 4  0.0430
## modc6 438.9   4.7 5  0.0278
## modc4 439.4   5.3 5  0.0208
## modc1 442.2   8.0 3  0.0053
## modc2 442.2   8.1 3  0.0052
## modc0 442.8   8.7 2  0.0038
## mod0  452.5  18.3 1  <0.001
## mod2  453.1  19.0 2  <0.001
## mod1  454.4  20.3 2  <0.001

Models results

Using 1:1 population gender ratio

summary(modc7)
## 
## Call:
## glm(formula = fem ~ position_cat * affirm_action + ratioFcat, 
##     family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5551  -1.1067  -0.6921   1.0936   1.7596  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                               -0.5000     0.4062  -1.231   0.2184  
## position_catprofessor                     -0.8154     0.4094  -1.992   0.0464 *
## position_catstudent                        0.6668     0.4150   1.607   0.1081  
## affirm_actionafter                         1.3190     0.7550   1.747   0.0806 .
## ratioFcat                                  0.0250     0.2966   0.084   0.9328  
## position_catprofessor:affirm_actionafter  -0.2694     0.9474  -0.284   0.7762  
## position_catstudent:affirm_actionafter    -1.6822     0.8525  -1.973   0.0485 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 422.13  on 323  degrees of freedom
## AIC: 436.13
## 
## Number of Fisher Scoring iterations: 4
performance::r2(modc7)
## # R2 for Logistic Regression
##   Tjur's R2: 0.082
myc7 <- ggpredict(modc7, terms=c("position_cat","affirm_action",
                                 "ratioFcat[1]"))
plot(myc7) +
  geom_hline(yintercept = 0.5, linetype="dashed")

summary(modc5)
## 
## Call:
## glm(formula = fem ~ position_cat * year + ratioFcat, family = binomial, 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4938  -1.0979  -0.7081   1.1347   1.8100  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                -381.95004  196.57186  -1.943   0.0520 .
## position_catprofessor       264.94673  270.94859   0.978   0.3281  
## position_catstudent         481.68398  199.33201   2.416   0.0157 *
## year                          0.18944    0.09751   1.943   0.0520 .
## ratioFcat                     0.13239    0.39651   0.334   0.7385  
## position_catprofessor:year   -0.13193    0.13450  -0.981   0.3266  
## position_catstudent:year     -0.23900    0.09896  -2.415   0.0157 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 422.87  on 323  degrees of freedom
## AIC: 436.87
## 
## Number of Fisher Scoring iterations: 4
performance::r2(modc5)
## # R2 for Logistic Regression
##   Tjur's R2: 0.081
myc5 <- ggpredict(modc5, terms=c("year", "position_cat",
                                 "ratioFcat[1]"))
plot(myc5) +
  geom_hline(yintercept = 0.5, linetype="dashed")

summary(modc3)
## 
## Call:
## glm(formula = fem ~ position_cat + ratioFcat, family = binomial, 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2758  -1.1441  -0.7425   1.1473   1.6932  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)  
## (Intercept)            -0.4022     0.3711  -1.084    0.278  
## position_catprofessor  -0.7993     0.3637  -2.198    0.028 *
## position_catstudent     0.2581     0.3433   0.752    0.452  
## ratioFcat               0.1616     0.2669   0.605    0.545  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.47  on 329  degrees of freedom
## Residual deviance: 429.62  on 326  degrees of freedom
## AIC: 437.62
## 
## Number of Fisher Scoring iterations: 4
performance::r2(modc3)
## # R2 for Logistic Regression
##   Tjur's R2: 0.061
myc3 <- ggpredict(modc3, terms=c("position_cat", "ratioFcat[1]"))
plot(myc3) +
  geom_hline(yintercept = 0.5, linetype="dashed")